
## Use naive bayes to look for more instances of praise in the eu data
## This script assumes that the current working directory is the top level of the dir


## code hacked from "Calculate jihad scores for tawhed.R"
## 
library(tm)
library(topicmodels)



##################################################
## Automates the process of putting docs into a DTM

makedtm <- function(loc, bell=T, tolower=F, stemming = F, stopwords = F, 
                    minWordLength = 1, removeNumbers = F, removePunctuation=F){
  require(tm)
  # get the paths to the text documents
  print("getting the paths")
  paths <- DirSource(loc)
  print(paste("there are",paths$Length," documents"))
  # I skip if there are too many fatwas 
  #if(paths$Length > 3000){next}
  print("constructing the corpus")
  (myCorpus <- Corpus(paths))
  print("makin the dtm")
  # first, get the document names
  docids <- rep(NA,length(paths$FileList))
  for(i in 1:length(docids)){
    path <- paths$FileList[i]
    temp <- gsub(paste(loc,"/",sep=""),"",path)
    docids[i] <- gsub(".txt","",temp)
  }
  mydtm <- DocumentTermMatrix(myCorpus, 
         control = list(tolower=tolower, stemming = stemming, stopwords = stopwords, 
                        minWordLength = minWordLength, removeNumbers = removeNumbers,
                        removePunctuation = removePunctuation))
  ## make the rownames to be the document ids
  rownames(mydtm) <- docids
  # ring a bell and return the dtm  
  if(bell){alarm()}
  return(mydtm)
}

## END function for making dtms automatically
######################################################

######################################################
## line up dtms

## Now keep only the terms in the training data
lineup <- function(dtm1, dtm2){
  if(is.data.frame(dtm1)==F | is.data.frame(dtm2)==F){
    stop("dtms must be data frames")
  }
  in.training <- names(dtm1)%in%names(dtm2)
  classifyme <- c()
  for(i in 2:ncol(dtm1)){
    if(in.training[i]){
      classifyme <- cbind(classifyme, dtm2[,names(dtm1)[i]])
    }
    else {
      classifyme <- cbind(classifyme, rep(0,nrow(dtm2)))
    }
  }
  colnames(classifyme) <- colnames(dtm1)[-1]
  rownames(classifyme) <- rownames(dtm2)
  return(as.data.frame(classifyme))
}

#######################################################

##########################################################
## naive bayes function

nbayes <- function(mat, class){
  V <- colnames(mat)
  B <- length(V)
  C <- unique(class)
  ## For each class, calculate the word probabilities given the class
  wordprobs <- matrix(NA,B,length(C))
  rownames(wordprobs) <- colnames(mat)
  colnames(wordprobs) <- C
  wordprobs
  for(i in 1:length(C)){
    sub.mat <- mat[class==C[i],]
    wordsums <- apply(sub.mat,MAR=2, sum)
    ## add the laplace prior of +1
    wordprobs[,i] <- (wordsums + 1)/(sum(sub.mat) + B)
  }
  return(wordprobs)
}

## For apply
nbpredict <- function(nbayes.obj, newdoc){
  if(ncol(nbayes.obj)!=2){stop("Must have 2 classes")}
  if(sum(rownames(nbayes.obj)==names(newdoc))!=nrow(nbayes.obj)){
    stop("terms do not line up")
  }
  ## turn the newdoc into a vector
  newdoc <- as.vector(newdoc)
  ## start the probability at 1
  plist <- rep(NA,length(newdoc))
  for(i in 1:length(newdoc)){ 
    pTermGivenA = nbayes.obj[i,1]^(newdoc[i])
    pTermGivenB = nbayes.obj[i,2]^(newdoc[i])
    plist[i] <- log(pTermGivenA/pTermGivenB)
  }
  return(sum(plist))
}

## END naive bayes functions
###########################################################

## Assumes that the current working directory is the top level of the dir
setwd("text/rawdata/EuroFinal14jul2010")

## sample 1000 briefs to use for the training
set.seed(123)
mysamp <- sample(list.files(), size=1000, replace=F)

currentdir <- getwd()
dir.create("../../training sample")
for(i in 1:length(mysamp)){
  oldpath <- paste(currentdir,"/",mysamp[i], sep="")
  newpath <- paste("../../training sample/",mysamp[i], sep="")
  file.create(newpath)
  file.copy(from=oldpath, to=newpath, overwrite=T)
}

## get the positive cases:

praise <- c("2005-0062PESC.txt","2009-0021PESC.txt")

for(i in 1:length(praise)){
  oldpath <- paste(currentdir,"/",praise[i], sep="")
  newpath <- paste("../../training sample/",praise[i], sep="")
  file.create(newpath)
  file.copy(from=oldpath, to=newpath, overwrite=T)
}

setwd("../../")
## Now, load up the 
dtm <- makedtm("C:/Users/Richard Nielsen/Desktop/Papers/Rewards for Ratification/archive/text/training sample/",
               tolower=T, stemming = T, stopwords = T, 
                    minWordLength = 1, removeNumbers = T, removePunctuation=T)

dtm <- makedtm("training sample/",
               tolower=T, stemming = T, stopwords = T, 
                    minWordLength = 1, removeNumbers = T, removePunctuation=T)
rownames(dtm) <- gsub("training sample/","",rownames(dtm),fixed=T)
dim(dtm)


## Removing terms that occur in less than one percent of the documents
## I got this to work (and output to csv) with .99
dtm2 <- removeSparseTerms(dtm, 0.99)
dim(dtm2)
dtm2 <- as.matrix(dtm2)


tmpdat <- cbind(rep(NA,nrow(dtm2)),dtm2)
dat <- as.data.frame(tmpdat)
head(dat)
names(dat)
names(dat)[1:10]
names(dat)[names(dat)=="V1"] <- "X"
names(dat)[1:10]

dat$X[rownames(dat) %in% gsub(".txt","",praise, fixed=T)] <- "B"
dat$X[!rownames(dat) %in% gsub(".txt","",praise, fixed=T)] <- "A"
table(dat$X)

## Run the model on the training data
out <- nbayes(mat = dat[,-1], class=dat$X)
ws <- out
## Time difference of 5.222817 mins
score <- apply(dat[,-1]/100, MAR=1,FUN=nbpredict,nbayes.obj=out) 
score <- score*-1

score[gsub(".txt","",praise, fixed=T)]

hist(score, breaks=50)
abline(v=score[gsub(".txt","",praise, fixed=T)][1], col="blue")
abline(v=score[gsub(".txt","",praise, fixed=T)][2], col="red")
  

## Now, run it on the full corpus of Euro briefs
getwd()
setwd("rawdata/EuroFinal14jul2010")
length(list.files())
## 34,335 documents

## I'm going to break it up and calculate the scores in 1000's

## make a holder
scoreholder <- c()
## make a sequence
ss <- seq(0,34000,1000)+1

tmpdir <- "../../tmp/"
dir.create(tmpdir)

## start the loop
for(k in ss){
  print(k)

  ## make the temp folder
  tmpdir <- "../../tmp/"
  #dir.create(tmpdir)

  ## clear out any old files
  allf <- list.files(tmpdir)
  for(j in allf){
    file.remove(paste(tmpdir,j,sep=""))
  }

  ## get the original files
  mysamp <- na.omit(list.files()[k:(k+999)])

  ## move them to the tmp folder
  for(i in 1:length(mysamp)){
    oldpath <- paste(currentdir,"/",mysamp[i], sep="")
    newpath <- paste(tmpdir,mysamp[i], sep="")
    file.create(newpath)
    file.copy(from=oldpath, to=newpath, overwrite=T)
  }

  ## make the dtm
  in.dtm <- makedtm(tmpdir,
               tolower=T, stemming = T, stopwords = T, 
                    minWordLength = 1, removeNumbers = T, removePunctuation=T)
  rownames(in.dtm) <- gsub("../../tmp/","",rownames(in.dtm),fixed=T)
  #dim(in.dtm)
  #inspect(in.dtm[1:10,1:10])

  ## get just the terms that are in the training data
  test <- as.data.frame(as.matrix(in.dtm[,rownames(out)[rownames(out)%in%colnames(in.dtm)]]),stringsAsFactors=F)
  dim(test)
  whichNotZero <- which(apply(test,MAR=1,sum)>0)
  in.dtm <- test[whichNotZero,]
  in.dtm <- lineup(dat, in.dtm)

  ## Calculate scores
  ## Dividing by 100 just divides the scores by 100, allowing scores
  tmp.score <- apply(in.dtm/100, MAR=1,FUN=nbpredict,nbayes.obj=out)
  
  ## add the scores to the vector
  scoreholder <- c(scoreholder,tmp.score)
}

length(scoreholder)

## After running the above, I saved it to this workspace
#save.image("../../praise scores.RData")
load("../../praise scores.RData")


scoreholder <- scoreholder*-1

hist(scoreholder, breaks=100)
abline(v=score[gsub(".txt","",praise, fixed=T)][1], col="blue")
abline(v=score[gsub(".txt","",praise, fixed=T)][2], col="red")

## How many documents got high scores?
sum(scoreholder > 0)
sum(scoreholder > .25)
sum(scoreholder > .5)
sum(scoreholder >= score[gsub(".txt","",praise, fixed=T)][1])

scoreholder[scoreholder >= score[gsub(".txt","",praise, fixed=T)][1]]

## hand code the documents

scorecode <- data.frame(score=names(rev(scoreholder[order(scoreholder)])),name=rev(scoreholder[order(scoreholder)]))

scorecode[1:10,]

## I just started at the top of the dataframe and went and read
## the individual text files in the order of their scores.
## I asked "did the EU praise or criticize a country about its
## accession (or failure thereof) to a human rights treaty or
## about its human rights practices in violation of a treaty
## or broad international norms of human rights?"

## 0 = no praise
## 1 = praise
## -1 = criticism

haspraise <- c(0,0,1,1,1,1,1,1,0,0,
               1,-1,1,-1,1,0,1,-1,0,0,
               1,-1,-1,0,-1,1,0,-1,-1,1,
               0,-1,0,-1,0,-1,-1,0,0,-1,
               -1,1,0,1,-1,-1,-1,0,0,-1,
               0,-1,-1,-1,0,1,0,0,0,1,
               -1,-1,0,-1,1,0,0,1,-1,0,
     ## Starting here, my criteria became more strict on it having to include reference to treaties
               0,0,0,0,0,0,0,-1,0,0,
               -1,1,-1,-1,0,-1,0,0,0,0,
               0,0,0,-1,0,0,0,0,0,0)

## Good example of criticism that evokes international commitments
## 2009-0129PESC.  Suggests that it is a two-edged sword.  You might
## get recognition but once you violate, you'll also get criticism that
## suggests you are being hypocritical.
## Also, 2009-0115PESC has one of the typical ones about iran
## executing minors.

## praise for abolition of death penalty in NJ is 2007-0110PESC




## Now, run the classifier just on the 822 docs that have scores
## greater than zero, with the actualy coded examples of praise 
## as the training:


pp <- as.character(scorecode$score[1:100][haspraise==1])

## First, I recode these to make sure they fit this definition:
## 1) they are praise
## 2) they mention ratification, accession, or compliance with/to
##    an international HRA.
## 0 means no
## .5 means as part of a larger brief
## 1 means the whole brief

real.praise <- c(.5,.5,.5,.5,
                 1,1,0,.5,
                 0,0,0,0,
                 1,0,1,1,
                 1,0,1,0)


## Then look for high ranking briefs that I haven't coded already
## in the earlier rounds
                 
rp <- pp[real.praise > 0]

posscore <- names(scoreholder[scoreholder>0])

## move them to a new folder
dir.create("../../likely subset")

mm <- c(rp,posscore)
for(i in 1:length(mm)){
  oldpath <- paste(currentdir,"/",mm[i],".txt", sep="")
  newpath <- paste("../../likely subset/",mm[i],".txt", sep="")
  file.create(newpath)
  file.copy(from=oldpath, to=newpath, overwrite=T)
}


dtml <- makedtm("../../likely subset",
               tolower=T, stemming = T, stopwords = T, 
                    minWordLength = 1, removeNumbers = T, removePunctuation=T)

dim(dtml)
inspect(dtml[1:10,1:10])

rownames(dtml) 


## Removing terms that occur in less than one percent of the documents
## I got this to work (and output to csv) with .99
dtml2 <- removeSparseTerms(dtml, 0.99)
dim(dtml2)
dtml2 <- as.matrix(dtml2)

tmpdat <- cbind(rep(NA,nrow(dtml2)),dtml2)
dat <- as.data.frame(tmpdat)
head(dat)
names(dat)
names(dat)[1:10]
names(dat)[names(dat)=="V1"] <- "X"
names(dat)[1:10]

dat$X[rownames(dat) %in% gsub(".txt","",rp, fixed=T)] <- "B"
dat$X[!rownames(dat) %in% gsub(".txt","",rp, fixed=T)] <- "A"
table(dat$X)
## Run the model on the training data
outl <- nbayes(mat = dat[,-1], class=dat$X)
## Time difference of 5.222817 mins
scorel <- apply(dat[,-1]/100, MAR=1,FUN=nbpredict,nbayes.obj=outl) 
scorel[rp]

hist(scorel, breaks=50)
abline(v=scorel[gsub(".txt","",rp, fixed=T)], col="blue")

## look at the most likely

length(scorel[scorel < 0])
length(scorel[scorel > 0])

## how many of those have I already coded?
as.character(scorecode$score[1:100])

## if T, then I've already looked at it
names(scorel[scorel < 0]) %in% as.character(scorecode$score[1:100])

## these are the ones to code
needcodes <- names(scorel[scorel < 0])[!(names(scorel[scorel < 0]) %in% as.character(scorecode$score[1:100]))]
needcodes

more.codes <- c(0,0,
                0,0,
                1,0,
                0,1,
                0,0,
                0,0,
                0,0,
                0,0,
                0,1,
                0,0,
                0,0,
                0,0,
                0,0,
                1,0,
                0,0,
                0)  

## and one more time, re-run the classifier with only the ones
## that spend a whole brief giving praise

wholepraise <- c(needcodes[more.codes==1], pp[real.praise ==1])


names(dat)[names(dat)=="V1"] <- "X"
names(dat)[1:10]

dat$X[rownames(dat) %in% gsub(".txt","",wholepraise, fixed=T)] <- "B"
dat$X[!rownames(dat) %in% gsub(".txt","",wholepraise, fixed=T)] <- "A"

## Run the model on the training data
outll <- nbayes(mat = dat[,-1], class=dat$X)
scorell <- apply(dat[,-1]/100, MAR=1,FUN=nbpredict,nbayes.obj=outll) 
scorell[wholepraise]

hist(scorell, breaks=50)
abline(v=scorell[gsub(".txt","",wholepraise, fixed=T)], col="blue")

## look at the most likely

length(scorell[scorell < 0])
length(scorell[scorell > 0])

## how many of those have I already coded?
c(as.character(scorecode$score[1:100]), needcodes)

## if T, then I've already looked at it
names(scorel[scorel < 0]) %in% c(as.character(scorecode$score[1:100]), needcodes)

## So, I've already looked at all of these.

length(wholepraise)

sort(wholepraise)

## Here's how many there are that praise in a full brief
length(c(needcodes[more.codes==1], pp[real.praise ==1]))

## here's how many have praise in whole or as part of a brief
length(c(needcodes[more.codes==1], pp[real.praise > 0]))

## how much criticism relative to praise?
sum(haspraise==1)
sum(haspraise==-1)
